import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt # Interactive data viz package cmd: pip install altair
import pygwalker as pyg #Access tableau feature in python cmd: pip install pygwalker
from skimpy import skim, generate_test_data #Gives overall stats cmd: pip install skimpy
from summarytools import dfSummary #combination of info & describe fuctions. cmd: pip install summarytools
import sidetable # Better value_counts function cmd: pip install sidetable
import warnings
warnings.filterwarnings("ignore")
Access to safe drinking-water is essential to health, a basic human right and a component of effective policy for health protection. This is important as a health and development issue at a national, regional and local level. In some regions, it has been shown that investments in water supply and sanitation can yield a net economic benefit, since the reductions in adverse health effects and health care costs outweigh the costs of undertaking the interventions.
The water_potability.csv file contains water quality metrics for 3276 different water bodies.
PH is an important parameter in evaluating the acid–base balance of water. It is also the indicator of acidic or alkaline condition of water status. WHO has recommended maximum permissible limit of pH from 6.5 to 8.5. The current investigation ranges were 6.52–6.83 which are in the range of WHO standards.
Hardness is mainly caused by calcium and magnesium salts. These salts are dissolved from geologic deposits through which water travels. The length of time water is in contact with hardness producing material helps determine how much hardness there is in raw water. Hardness was originally defined as the capacity of water to precipitate soap caused by Calcium and Magnesium.
Water has the ability to dissolve a wide range of inorganic and some organic minerals or salts such as potassium, calcium, sodium, bicarbonates, chlorides, magnesium, sulfates etc. These minerals produced un-wanted taste and diluted color in appearance of water. This is the important parameter for the use of water. The water with high TDS value indicates that water is highly mineralized. Desirable limit for TDS is 500 mg/l and maximum limit is 1000 mg/l which prescribed for drinking purpose.
Chlorine and chloramine are the major disinfectants used in public water systems. Chloramines are most commonly formed when ammonia is added to chlorine to treat drinking water. Chlorine levels up to 4 milligrams per liter (mg/L or 4 parts per million (ppm)) are considered safe in drinking water.
Sulfates are naturally occurring substances that are found in minerals, soil, and rocks. They are present in ambient air, groundwater, plants, and food. The principal commercial use of sulfate is in the chemical industry. Sulfate concentration in seawater is about 2,700 milligrams per liter (mg/L). It ranges from 3 to 30 mg/L in most freshwater supplies, although much higher concentrations (1000 mg/L) are found in some geographic locations.
Pure water is not a good conductor of electric current; rather, it is a good insulator. An increase in ion concentration enhances the electrical conductivity of water. Generally, the amount of dissolved solids in water determines the electrical conductivity. Electrical conductivity (EC) actually measures the ionic process of a solution that enables it to transmit current. According to WHO standards, the EC value should not exceed 400 μS/cm.
Total Organic Carbon (TOC) in source waters comes from decaying natural organic matter (NOM) as well as synthetic sources. TOC is a measure of the total amount of carbon in organic compounds in pure water. According to the US EPA, TOC should be < 2 mg/L in treated / drinking water, and < 4 mg/L in source water which is used for treatment.
THMs are chemicals which may be found in water treated with chlorine. The concentration of THMs in drinking water varies according to the level of organic material in the water, the amount of chlorine required to treat the water, and the temperature of the water that is being treated. THM levels up to 80 ppm are considered safe in drinking water.
The turbidity of water depends on the quantity of solid matter present in the suspended state. It is a measure of light emitting properties of water and the test is used to indicate the quality of waste discharge with respect to colloidal matter. The mean turbidity value obtained for Wondo Genet Campus (0.98 NTU) is lower than the WHO recommended value of 5.00 NTU.
Indicates if water is safe for human consumption where 1 means Potable and 0 means Not potable.
# Load the raw water-quality measurements (path is relative to the working directory).
df = pd.read_csv("water_potability.csv")
# Preview the first five rows.
df.head()
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | 204.890455 | 20791.318981 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 | 0 |
| 1 | 3.716080 | 129.422921 | 18630.057858 | 6.635246 | NaN | 592.885359 | 15.180013 | 56.329076 | 4.500656 | 0 |
| 2 | 8.099124 | 224.236259 | 19909.541732 | 9.275884 | NaN | 418.606213 | 16.868637 | 66.420093 | 3.055934 | 0 |
| 3 | 8.316766 | 214.373394 | 22018.417441 | 8.059332 | 356.886136 | 363.266516 | 18.436524 | 100.341674 | 4.628771 | 0 |
| 4 | 9.092223 | 181.101509 | 17978.986339 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 31.997993 | 4.075075 | 0 |
# Column dtypes and non-null counts; used to spot which columns have missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3276 entries, 0 to 3275 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ph 2785 non-null float64 1 Hardness 3276 non-null float64 2 Solids 3276 non-null float64 3 Chloramines 3276 non-null float64 4 Sulfate 2495 non-null float64 5 Conductivity 3276 non-null float64 6 Organic_carbon 3276 non-null float64 7 Trihalomethanes 3114 non-null float64 8 Turbidity 3276 non-null float64 9 Potability 3276 non-null int64 dtypes: float64(9), int64(1) memory usage: 256.1 KB
# Count, mean, std, min/max and quartiles for every numeric column.
df.describe()
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 2785.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 2495.000000 | 3276.000000 | 3276.000000 | 3114.000000 | 3276.000000 | 3276.000000 |
| mean | 7.080795 | 196.369496 | 22014.092526 | 7.122277 | 333.775777 | 426.205111 | 14.284970 | 66.396293 | 3.966786 | 0.390110 |
| std | 1.594320 | 32.879761 | 8768.570828 | 1.583085 | 41.416840 | 80.824064 | 3.308162 | 16.175008 | 0.780382 | 0.487849 |
| min | 0.000000 | 47.432000 | 320.942611 | 0.352000 | 129.000000 | 181.483754 | 2.200000 | 0.738000 | 1.450000 | 0.000000 |
| 25% | 6.093092 | 176.850538 | 15666.690297 | 6.127421 | 307.699498 | 365.734414 | 12.065801 | 55.844536 | 3.439711 | 0.000000 |
| 50% | 7.036752 | 196.967627 | 20927.833607 | 7.130299 | 333.073546 | 421.884968 | 14.218338 | 66.622485 | 3.955028 | 0.000000 |
| 75% | 8.062066 | 216.667456 | 27332.762127 | 8.114887 | 359.950170 | 481.792304 | 16.557652 | 77.337473 | 4.500320 | 1.000000 |
| max | 14.000000 | 323.124000 | 61227.196008 | 13.127000 | 481.030642 | 753.342620 | 28.300000 | 124.000000 | 6.739000 | 1.000000 |
# skimpy overview of the raw data: per-column NA counts, mean/sd, percentiles and a mini histogram.
skim(df)
╭──────────────────────────────────────────────── skimpy summary ─────────────────────────────────────────────────╮ │ Data Summary Data Types │ │ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓ ┏━━━━━━━━━━━━━┳━━━━━━━┓ │ │ ┃ dataframe ┃ Values ┃ ┃ Column Type ┃ Count ┃ │ │ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩ │ │ │ Number of rows │ 3276 │ │ float64 │ 9 │ │ │ │ Number of columns │ 10 │ │ int32 │ 1 │ │ │ └───────────────────┴────────┘ └─────────────┴───────┘ │ │ number │ │ ┏━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ │ │ ┃ column_name ┃ NA ┃ NA % ┃ mean ┃ sd ┃ p0 ┃ p25 ┃ p75 ┃ p100 ┃ hist ┃ │ │ ┡━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ │ │ │ ph │ 490 │ 15 │ 7.1 │ 1.6 │ 0 │ 6.1 │ 8.1 │ 14 │ ▁██▁ │ │ │ │ Hardness │ 0 │ 0 │ 200 │ 33 │ 47 │ 180 │ 220 │ 320 │ ▁▅█▂ │ │ │ │ Solids │ 0 │ 0 │ 22000 │ 8800 │ 320 │ 16000 │ 27000 │ 61000 │ ▁█▇▂ │ │ │ │ Chloramines │ 0 │ 0 │ 7.1 │ 1.6 │ 0.35 │ 6.1 │ 8.1 │ 13 │ ▁▆█▂ │ │ │ │ Sulfate │ 780 │ 24 │ 330 │ 41 │ 130 │ 310 │ 360 │ 480 │ ▃█▃ │ │ │ │ Conductivity │ 0 │ 0 │ 430 │ 81 │ 180 │ 370 │ 480 │ 750 │ ▅█▅▁ │ │ │ │ Organic_carbon │ 0 │ 0 │ 14 │ 3.3 │ 2.2 │ 12 │ 17 │ 28 │ ▂█▆▁ │ │ │ │ Trihalomethanes │ 160 │ 4.9 │ 66 │ 16 │ 0.74 │ 56 │ 77 │ 120 │ ▁▆█▂ │ │ │ │ Turbidity │ 0 │ 0 │ 4 │ 0.78 │ 1.4 │ 3.4 │ 4.5 │ 6.7 │ ▃█▇▂ │ │ │ │ Potability │ 0 │ 0 │ 0.39 │ 0.49 │ 0 │ 0 │ 1 │ 1 │ █ ▅ │ │ │ └───────────────────────┴───────┴────────┴─────────┴────────┴────────┴─────────┴─────────┴─────────┴─────────┘ │ ╰────────────────────────────────────────────────────── End ──────────────────────────────────────────────────────╯
From the following description of the dataset we observe:
def pot_label(p):
    """Translate a numeric potability flag into a readable label.

    Args:
        p: Potability value from the dataset; 0 marks non-potable water.

    Returns:
        'No' when ``p`` equals 0, otherwise 'Yes'.
    """
    return 'No' if p == 0 else 'Yes'
# Replace the 0/1 target with 'No'/'Yes' labels for the exploratory plots and summaries.
df["Potability"] = df["Potability"].apply(pot_label)
df.head()
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | 204.890455 | 20791.318981 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 | No |
| 1 | 3.716080 | 129.422921 | 18630.057858 | 6.635246 | NaN | 592.885359 | 15.180013 | 56.329076 | 4.500656 | No |
| 2 | 8.099124 | 224.236259 | 19909.541732 | 9.275884 | NaN | 418.606213 | 16.868637 | 66.420093 | 3.055934 | No |
| 3 | 8.316766 | 214.373394 | 22018.417441 | 8.059332 | 356.886136 | 363.266516 | 18.436524 | 100.341674 | 4.628771 | No |
| 4 | 9.092223 | 181.101509 | 17978.986339 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 31.997993 | 4.075075 | No |
# summarytools report: per-variable stats, distinct/frequency counts and missing-value share.
dfSummary(df)
| No | Variable | Stats / Values | Freqs / (% of Valid) | Graph | Missing |
|---|---|---|---|---|---|
| 1 | ph [float64] |
Mean (sd) : 7.1 (1.6) min < med < max: 0.0 < 7.0 < 14.0 IQR (CV) : 2.0 (4.4) |
2,785 distinct values | 491 (15.0%) |
|
| 2 | Hardness [float64] |
Mean (sd) : 196.4 (32.9) min < med < max: 47.4 < 197.0 < 323.1 IQR (CV) : 39.8 (6.0) |
3,276 distinct values | 0 (0.0%) |
|
| 3 | Solids [float64] |
Mean (sd) : 22014.1 (8768.6) min < med < max: 320.9 < 20927.8 < 61227.2 IQR (CV) : 11666.1 (2.5) |
3,276 distinct values | 0 (0.0%) |
|
| 4 | Chloramines [float64] |
Mean (sd) : 7.1 (1.6) min < med < max: 0.4 < 7.1 < 13.1 IQR (CV) : 2.0 (4.5) |
3,276 distinct values | 0 (0.0%) |
|
| 5 | Sulfate [float64] |
Mean (sd) : 333.8 (41.4) min < med < max: 129.0 < 333.1 < 481.0 IQR (CV) : 52.3 (8.1) |
2,495 distinct values | 781 (23.8%) |
|
| 6 | Conductivity [float64] |
Mean (sd) : 426.2 (80.8) min < med < max: 181.5 < 421.9 < 753.3 IQR (CV) : 116.1 (5.3) |
3,276 distinct values | 0 (0.0%) |
|
| 7 | Organic_carbon [float64] |
Mean (sd) : 14.3 (3.3) min < med < max: 2.2 < 14.2 < 28.3 IQR (CV) : 4.5 (4.3) |
3,276 distinct values | 0 (0.0%) |
|
| 8 | Trihalomethanes [float64] |
Mean (sd) : 66.4 (16.2) min < med < max: 0.7 < 66.6 < 124.0 IQR (CV) : 21.5 (4.1) |
3,114 distinct values | 162 (4.9%) |
|
| 9 | Turbidity [float64] |
Mean (sd) : 4.0 (0.8) min < med < max: 1.4 < 4.0 < 6.7 IQR (CV) : 1.1 (5.1) |
3,276 distinct values | 0 (0.0%) |
|
| 10 | Potability [object] |
1. No 2. Yes |
1,998 (61.0%) 1,278 (39.0%) |
0 (0.0%) |
#Drop Viz?
sns.set_style("whitegrid")
# Count of samples per potability class.
# The column must be passed as the `x` keyword: seaborn >= 0.12 accepts only
# `data` positionally, so a bare 'Potability' would be taken as the dataset.
a = sns.catplot(x='Potability', kind='count', data=df, palette='autumn')
a.set(xlabel="Water Potability?", ylabel="count", title='Potability of water')
<seaborn.axisgrid.FacetGrid at 0x20d4ba3f100>
In order to fill the missing values of the pH, Sulfate & Trihalomethanes columns, we first look at their distributions; based on those, an appropriate approximation can be made.
#Drop viz?
# Histogram of pH; `bin=True` lets Altair choose the bins for the quantitative field.
chart = alt.Chart(df).mark_bar().encode(
    alt.X('ph:Q', bin=True, axis=alt.Axis(title="pH value")),
    alt.Y('count()', axis=alt.Axis(title="count")),
).properties(title="Distribution of pH values")
chart
#Drop viz?
# Histogram of Sulfate. The unit is written "mg/L": the former "mg\L" contained
# an invalid string escape and rendered a backslash in the axis title.
chart1 = alt.Chart(df).mark_bar().encode(
    alt.X('Sulfate:Q', bin=True, axis=alt.Axis(title="Sulfate content (mg/L)")),
    alt.Y('count()', axis=alt.Axis(title="count")),
).properties(title="Distribution of Sulfate values")
chart1
#Drop viz?
# Histogram of Trihalomethanes.
chart2 = alt.Chart(df).mark_bar().encode(
    alt.X('Trihalomethanes:Q', bin=True, axis=alt.Axis(title="Trihalomethanes content (μg/L)")),
    alt.Y('count()', axis=alt.Axis(title="count")),
).properties(title="Distribution of Trihalomethane values")
chart2
From the histograms of pH & Trihalomethanes, we can infer that these variables are approximately normally distributed, while Sulfate is somewhat left-skewed. Hence we can fill the missing values with the mean for pH & Trihalomethanes and with the median for Sulfate.
# Impute missing values: mean for pH and Trihalomethanes, median for Sulfate.
# The filled Series is assigned back to the frame rather than using
# `df.col.fillna(..., inplace=True)`: that form mutates an intermediate object
# and leaves `df` untouched when pandas Copy-on-Write is active.
df['ph'] = df['ph'].fillna(df['ph'].mean())
df['Sulfate'] = df['Sulfate'].fillna(df['Sulfate'].median())
df['Trihalomethanes'] = df['Trihalomethanes'].fillna(df['Trihalomethanes'].mean())
# Persist the imputed data (the index is written as the first column).
df.to_csv('Cleaned_data.csv')
df.head()
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.080795 | 204.890455 | 20791.318981 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 | No |
| 1 | 3.716080 | 129.422921 | 18630.057858 | 6.635246 | 333.073546 | 592.885359 | 15.180013 | 56.329076 | 4.500656 | No |
| 2 | 8.099124 | 224.236259 | 19909.541732 | 9.275884 | 333.073546 | 418.606213 | 16.868637 | 66.420093 | 3.055934 | No |
| 3 | 8.316766 | 214.373394 | 22018.417441 | 8.059332 | 356.886136 | 363.266516 | 18.436524 | 100.341674 | 4.628771 | No |
| 4 | 9.092223 | 181.101509 | 17978.986339 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 31.997993 | 4.075075 | No |
# Re-run the skimpy overview on the imputed frame to check the NA counts again.
skim(df)
╭──────────────────────────────────────────────── skimpy summary ─────────────────────────────────────────────────╮ │ Data Summary Data Types │ │ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓ ┏━━━━━━━━━━━━━┳━━━━━━━┓ │ │ ┃ dataframe ┃ Values ┃ ┃ Column Type ┃ Count ┃ │ │ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩ ┡━━━━━━━━━━━━━╇━━━━━━━┩ │ │ │ Number of rows │ 3276 │ │ float64 │ 9 │ │ │ │ Number of columns │ 10 │ │ string │ 1 │ │ │ └───────────────────┴────────┘ └─────────────┴───────┘ │ │ number │ │ ┏━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓ │ │ ┃ column_name ┃ NA ┃ NA % ┃ mean ┃ sd ┃ p0 ┃ p25 ┃ p75 ┃ p100 ┃ hist ┃ │ │ ┡━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩ │ │ │ ph │ 0 │ 0 │ 7.1 │ 1.5 │ 0 │ 6.3 │ 7.9 │ 14 │ ▁▆█▁ │ │ │ │ Hardness │ 0 │ 0 │ 200 │ 33 │ 47 │ 180 │ 220 │ 320 │ ▁▅█▂ │ │ │ │ Solids │ 0 │ 0 │ 22000 │ 8800 │ 320 │ 16000 │ 27000 │ 61000 │ ▁█▇▂ │ │ │ │ Chloramines │ 0 │ 0 │ 7.1 │ 1.6 │ 0.35 │ 6.1 │ 8.1 │ 13 │ ▁▆█▂ │ │ │ │ Sulfate │ 0 │ 0 │ 330 │ 36 │ 130 │ 320 │ 350 │ 480 │ ▂█▂ │ │ │ │ Conductivity │ 0 │ 0 │ 430 │ 81 │ 180 │ 370 │ 480 │ 750 │ ▅█▅▁ │ │ │ │ Organic_carbon │ 0 │ 0 │ 14 │ 3.3 │ 2.2 │ 12 │ 17 │ 28 │ ▂█▆▁ │ │ │ │ Trihalomethanes │ 0 │ 0 │ 66 │ 16 │ 0.74 │ 57 │ 77 │ 120 │ ▁▅█▂ │ │ │ │ Turbidity │ 0 │ 0 │ 4 │ 0.78 │ 1.4 │ 3.4 │ 4.5 │ 6.7 │ ▃█▇▂ │ │ │ └───────────────────────┴──────┴────────┴──────────┴────────┴────────┴─────────┴─────────┴─────────┴─────────┘ │ │ string │ │ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓ │ │ ┃ column_name ┃ NA ┃ NA % ┃ words per row ┃ total words ┃ │ │ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ │ │ Potability │ 0 │ 0 │ 1 │ 3300 │ │ │ └───────────────────────────┴─────────┴────────────┴──────────────────────────────┴──────────────────────────┘ │ 
╰────────────────────────────────────────────────────── End ──────────────────────────────────────────────────────╯
We now have a complete dataset & can begin our analysis.
# Horizontal boxplot of pH with one box per potability label.
ph_ax = sns.boxplot(x="ph", y="Potability", data=df)
ph_ax.set_title("Distribution of pH based on Potability of water")
ph_ax.set_xlabel("pH")
Text(0.5, 0, 'pH')
# pH per potability label, shown as a boxplot (left) and a violin plot (right).
fig, ph_axes = plt.subplots(1, 2, figsize=(13, 5))
ax_box, ax_violin = ph_axes
sns.boxplot(data=df, x="ph", y="Potability", ax=ax_box)
ax_box.set_title("Boxplot of pH distribution")
sns.violinplot(data=df, x="ph", y="Potability", ax=ax_violin)
ax_violin.set_title("Violinplot of pH distribution")
Text(0.5, 1.0, 'Violinplot of pH distribution')
# Chloramines, Organic_carbon and Turbidity drawn together: boxplot left, violin plot right.
fig, (ax_box1, ax_violin1) = plt.subplots(1, 2, figsize=(15, 5.5))
low_range_features = df[['Chloramines', 'Organic_carbon', 'Turbidity']]
sns.boxplot(data=low_range_features, orient='h', ax=ax_box1)
ax_box1.set_title("Boxplot of Chloramines(ppm), Organic Carbon(ppm) & Turbidity(NTU)")
sns.violinplot(data=low_range_features, orient='h', ax=ax_violin1)
ax_violin1.set_title("Violinplot of Chloramines(ppm), Organic Carbon(ppm) & Turbidity(NTU)")
Text(0.5, 1.0, 'Violinplot of Chloramines(ppm), Organic Carbon(ppm) & Turbidity(NTU)')
# Sulfate and Conductivity drawn together: boxplot left, violin plot right.
fig, (ax_box2, ax_violin2) = plt.subplots(1, 2, figsize=(15, 5.5))
sulfate_conductivity = df[['Sulfate', 'Conductivity']]
sns.boxplot(data=sulfate_conductivity, orient="h", ax=ax_box2)
ax_box2.set_title("Boxplot of Sulfates(mg/L) & Conductivity(μS/cm)")
sns.violinplot(data=sulfate_conductivity, orient='h', ax=ax_violin2)
ax_violin2.set_title("Violinplot of Sulfates(mg/L) & Conductivity(μS/cm)")
Text(0.5, 1.0, 'Violinplot of Sulfates(mg/L) & Conductivity(μS/cm)')
# Distribution of the Solids column (total dissolved solids): boxplot left, violin plot right.
fig, (ax_box3, ax_violin3) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5.5))
sns.boxplot(data=df.Solids, orient="h", ax=ax_box3)
sns.violinplot(data=df.Solids, orient="h", ax=ax_violin3)
ax_box3.set_title("Boxplot distribution of Total Dissolved Solids(ppm)")
ax_violin3.set_title("Violinplot distribution of Total Dissolved Solids(ppm)")
# Both panels plot Solids, so the axis is labelled as TDS, not sulfate.
ax_box3.set_xlabel("Total Dissolved Solids(ppm)")
ax_violin3.set_xlabel("Total Dissolved Solids(ppm)")
Text(0.5, 0, 'Sulfate Amount(ppm)')
# Measurement columns offered in the tooltips; pH is excluded because it is
# always on the x-axis. Names must match the DataFrame columns exactly
# (the column is 'Sulfate', not 'Sulfates').
TOOLTIP_FIELDS = [
    'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
    'Organic_carbon', 'Trihalomethanes', 'Turbidity',
]


def ph_scatter(y_field, y_title, scheme, title):
    """Build an interactive Altair scatter of pH against one other measurement.

    Args:
        y_field: Name of the ``df`` column plotted on the y-axis.
        y_title: Axis title for the y-axis.
        scheme: Name of the colour scheme used for the Potability colouring.
        title: Chart title.

    Returns:
        The interactive chart. Points are coloured by Potability and the
        tooltip lists every field in ``TOOLTIP_FIELDS`` except ``y_field``.
    """
    tooltip = [f"{name}:Q" for name in TOOLTIP_FIELDS if name != y_field]
    return alt.Chart(df).mark_circle(size=60).encode(
        x=alt.X('ph', axis=alt.Axis(title="pH")),
        y=alt.Y(y_field, axis=alt.Axis(title=y_title)),
        color=alt.Color('Potability:O', scale=alt.Scale(scheme=scheme)),
        tooltip=tooltip,
    ).properties(title=title).interactive()


ph_scatter('Hardness', "Hardness(mg/L)", 'magma', "pH vs. Hardness")
#Double tap graph to re-center
ph_scatter('Solids', 'TDS amount(ppm)', 'set1', "pH vs. Total Dissolved Solids")
ph_scatter('Chloramines', 'Chloramine amount(ppm)', 'category10', "pH vs. Chloramines")
ph_scatter('Sulfate', 'Sulfate amount(mg/L)', 'dark2', "pH vs. Sulfates")
ph_scatter('Conductivity', 'Conductivity of water(μS/cm)', 'viridis', "pH vs. Conductivity")
ph_scatter('Organic_carbon', 'Amount of Organic Carbon(ppm)', 'set2', "pH vs. Organic Carbon")
ph_scatter('Trihalomethanes', 'Trihalomethanes amount(μg/L)', 'magma', "pH vs. Trihalomethanes")
ph_scatter('Turbidity', 'Turbidity(NTU)', 'darkmulti', "pH vs. Turbidity")
On observing the plots of our dataset, we notice that there are pH values that do not make sense. From our research we know that the ideal pH range of drinkable water is between 6.5 and 8.5. However, this is not reflected in the given dataset. For instance, the data states that water with a pH value of 4 is drinkable. There are also instances where the pH value is greater than 10 and the water is marked as drinkable.
# Pairwise correlation of the numeric features.
# Potability holds 'No'/'Yes' strings at this point, so only numeric columns
# are selected; calling corr() on a frame with a string column raises on
# pandas >= 2.0.
corr_matrix = df.select_dtypes(include='number').corr()
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True, fmt='.2f', linewidths=0.5)
<AxesSubplot:>
From the following box and violin plots, we can notice there are many outliers present in our dataset.
Outlier treatment - Since this is synthetic data, the outliers have no real significance, and because the features are uncorrelated, we will clip the outlier values to better fit the model. Reference - https://datascience.stackexchange.com/questions/65802/for-outliers-treatment-clipping-winsorizing-or-removing
# Clip every feature to its own 5th-95th percentile range: values outside the
# range are replaced by the nearest bound, so no rows are dropped.
clip_columns = [
    'ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate',
    'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity',
]
for column in clip_columns:
    lower_bound = df[column].quantile(0.05)
    upper_bound = df[column].quantile(0.95)
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
# Save the clipped data (the index is written as the first column).
df.to_csv('Outlier_treated_data.csv')
from sklearn.preprocessing import StandardScaler
# Feature frame without the target column; `drop` returns a new frame, so `df` is untouched.
X1 = df.drop(columns='Potability')
# Standardize each feature (fit and transform in one step).
scaler = StandardScaler()
Xa = scaler.fit_transform(X1)
Xa[:5]
array([[-3.31823713e-03, 2.95445357e-01, -1.39254296e-01,
1.22913381e-01, 1.13490507e+00, 1.84182066e+00,
-1.29603398e+00, 1.46127900e+00, -1.41117235e+00],
[-1.87768215e+00, -1.88370849e+00, -4.09992788e-01,
-3.48132686e-01, -2.14645385e-02, 1.86891055e+00,
2.99743056e-01, -7.15347231e-01, 7.51401172e-01],
[ 7.94661459e-01, 9.63263664e-01, -2.49713435e-01,
1.52243217e+00, -2.14645385e-02, -9.23829495e-02,
8.61105132e-01, 9.93726616e-04, -1.28064841e+00],
[ 9.65209029e-01, 6.22796964e-01, 1.44628014e-02,
6.60656263e-01, 7.55451534e-01, -8.27018833e-01,
1.38232994e+00, 1.79872922e+00, 9.31598080e-01],
[ 1.57287019e+00, -5.25750488e-01, -4.91551707e-01,
-4.10927337e-01, -7.69839745e-01, -3.60477394e-01,
-9.04257456e-01, -1.88117169e+00, 1.52807760e-01]])
from statsmodels.stats.outliers_influence import variance_inflation_factor
#dfq = pd.DataFrame(Xa,columns=df.columns)
# Put the scaled array back into a labelled frame and discard any row containing NaN.
dfq = pd.DataFrame(Xa, columns=X1.columns).dropna()
# Predictor columns to check for multicollinearity.
vif_columns = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate',
               'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']
predictors = dfq[vif_columns]
# variance_inflation_factor is called with a plain array plus a column index.
X = np.array(predictors)
n_predictors = X.shape[1]
# One VIF value per predictor column, in `vif_columns` order.
vif = [variance_inflation_factor(X, column_index) for column_index in range(n_predictors)]
print(vif)
[1.0199647537705459, 1.0217545715528653, 1.0325165306406208, 1.0075228023572673, 1.0293293618560084, 1.0026977082001023, 1.0036444941462002, 1.0023438017002047, 1.0039011393372756]
From this we understand that there is no multicollinearity among the features (all VIF values are close to 1).
# Preview the first five rows of the standardized feature frame.
dfq.head()
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.003318 | 0.295445 | -0.139254 | 0.122913 | 1.134905 | 1.841821 | -1.296034 | 1.461279 | -1.411172 |
| 1 | -1.877682 | -1.883708 | -0.409993 | -0.348133 | -0.021465 | 1.868911 | 0.299743 | -0.715347 | 0.751401 |
| 2 | 0.794661 | 0.963264 | -0.249713 | 1.522432 | -0.021465 | -0.092383 | 0.861105 | 0.000994 | -1.280648 |
| 3 | 0.965209 | 0.622797 | 0.014463 | 0.660656 | 0.755452 | -0.827019 | 1.382330 | 1.798729 | 0.931598 |
| 4 | 1.572870 | -0.525750 | -0.491552 | -0.410927 | -0.769840 | -0.360477 | -0.904257 | -1.881172 | 0.152808 |
# Drop rows with any missing value from the unscaled feature frame, then display it.
# NOTE(review): the missing values were imputed earlier, so this is presumably a no-op — confirm.
X1 = X1.dropna()
X1
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.080795 | 204.890455 | 20791.318981 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 |
| 1 | 4.688853 | 141.763281 | 18630.057858 | 6.635246 | 333.073546 | 566.349320 | 15.180013 | 56.329076 | 4.500656 |
| 2 | 8.099124 | 224.236259 | 19909.541732 | 9.275884 | 333.073546 | 418.606213 | 16.868637 | 66.420093 | 3.055934 |
| 3 | 8.316766 | 214.373394 | 22018.417441 | 8.059332 | 356.886136 | 363.266516 | 18.436524 | 91.744595 | 4.628771 |
| 4 | 9.092223 | 181.101509 | 17978.986339 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 39.906235 | 4.075075 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3271 | 4.688853 | 193.681735 | 38474.990249 | 7.166639 | 359.948574 | 526.424171 | 13.894419 | 66.687695 | 4.435821 |
| 3272 | 7.808856 | 193.553212 | 17329.802160 | 8.061362 | 333.073546 | 392.449580 | 19.637254 | 66.396293 | 2.798243 |
| 3273 | 9.419510 | 175.762646 | 33155.578218 | 7.350233 | 333.073546 | 432.044783 | 11.039070 | 69.845400 | 3.298875 |
| 3274 | 5.126763 | 230.603758 | 11983.869376 | 6.303357 | 333.073546 | 402.883113 | 11.168946 | 77.488213 | 4.708658 |
| 3275 | 7.874671 | 195.102299 | 17404.177061 | 7.509306 | 333.073546 | 327.459760 | 16.140368 | 78.698446 | 2.684279 |
3276 rows × 9 columns
#Convert back for model building
def num_pot_label(p):
    """Translate a potability label back into its numeric flag.

    Args:
        p: Potability label, expected to be 'No' or 'Yes'.

    Returns:
        0 when ``p`` equals 'No', otherwise 1.
    """
    return 0 if p == 'No' else 1
# Turn the 'No'/'Yes' labels back into 0/1 so the classifiers get a numeric target.
df["Potability"] = df["Potability"].apply(num_pot_label)
df.head()
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.080795 | 204.890455 | 20791.318981 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 | 0 |
| 1 | 4.688853 | 141.763281 | 18630.057858 | 6.635246 | 333.073546 | 566.349320 | 15.180013 | 56.329076 | 4.500656 | 0 |
| 2 | 8.099124 | 224.236259 | 19909.541732 | 9.275884 | 333.073546 | 418.606213 | 16.868637 | 66.420093 | 3.055934 | 0 |
| 3 | 8.316766 | 214.373394 | 22018.417441 | 8.059332 | 356.886136 | 363.266516 | 18.436524 | 91.744595 | 4.628771 | 0 |
| 4 | 9.092223 | 181.101509 | 17978.986339 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 39.906235 | 4.075075 | 0 |
#Blocks for data reduction in order to build models.
# Reload the clipped data from disk. The file was written with its index, so
# the index comes back as an extra unnamed first column.
# NOTE(review): df1 is not referenced again in the visible code — the models
# below are fit on `df`; confirm whether this reload is still needed.
df1 = pd.read_csv("Outlier_treated_data.csv")
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# Local imports so this cell is self-contained.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Feature matrix and 0/1 target.
X = df[["ph","Hardness","Solids","Chloramines","Sulfate","Conductivity","Organic_carbon","Trihalomethanes","Turbidity"]]
Y = df.Potability
# 75/25 split with a fixed seed; the later models reuse these splits.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)
# The features differ by orders of magnitude (Solids ~1e4, Turbidity ~1), which
# hampers the solver and skews the L2 penalty. Standardize inside a pipeline so
# the scaler is fitted on the training split only, and allow more iterations
# than the default so the solver can converge.
log_reg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
log_reg.fit(X_train,Y_train)
y_pred = log_reg.predict(X_test)
# Show the first few predictions next to the first few true labels.
print(y_pred[0:5],"\n")
print(Y_test.head())
[0 0 0 0 0] 2017 1 2533 0 589 0 482 0 2620 0 Name: Potability, dtype: int64
# Score the logistic-regression predictions on the held-out split.
log_reg_accuracy = accuracy_score(Y_test, y_pred)
# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(Y_test, y_pred)
print("Confusion Matrix :\n", conf_matrix, "\n\n")
Confusion Matrix : [[502 0] [317 0]]
# Heatmap of the logistic-regression confusion matrix.
sns.set(font_scale=1.4)
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues')
plt.title('Confusion matrix of Logistic Regression')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
# Convert to percent before rounding: round(x, 2) * 100 drops precision and can
# print float artefacts such as 56.99999999999999. This also matches how the
# other models report accuracy.
print(f"Model accuracy :{round(log_reg_accuracy * 100, 2)}%")
Model accuracy :61.0%
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Decision tree on the shared train/test split. A fixed seed makes the tree,
# and therefore the reported accuracy, reproducible between runs.
dtree = DecisionTreeClassifier(random_state=0)
dtree.fit(X_train, Y_train)
dtree_y_pred = dtree.predict(X_test)
dtree_accuracy = accuracy_score(Y_test, dtree_y_pred)
print("Accuracy: {:.2f}%".format(dtree_accuracy * 100))
Accuracy: 55.31%
# Confusion matrix of the decision-tree predictions (rows: true class, columns: predicted class).
dtree_conf_matrix = confusion_matrix(y_true=Y_test, y_pred=dtree_y_pred)
print(dtree_conf_matrix)
[[322 180] [186 131]]
# Heatmap and accuracy for the decision tree. The title and message name the
# decision tree, since `dtree_conf_matrix` / `dtree_accuracy` are plotted here.
sns.set(font_scale=1.4)
sns.heatmap(dtree_conf_matrix, annot=True, fmt='g', cmap='icefire')
plt.title('Confusion matrix of Decision Tree classifier')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
print(f"Model Accuracy of Decision Tree :{round(dtree_accuracy*100,2)}%")
Model Accuracy of KNN :55.31%
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
# Local imports so this cell is self-contained.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# k-NN classifies by distance, so on raw features the largest-valued column
# (Solids) dominates the neighbour search. Standardize inside a pipeline so
# every feature contributes comparably and the scaler only sees training data.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
knn.fit(X_train, Y_train)
y_pred = knn.predict(X_test)
knn_acc = accuracy_score(Y_test, y_pred)
# Create a confusion matrix
cm = confusion_matrix(Y_test, y_pred)
print(cm)
[[363 139] [230 87]]
# Heatmap of the k-NN confusion matrix, followed by the accuracy in percent.
sns.set(font_scale=1.4)
knn_ax = sns.heatmap(cm, annot=True, fmt='g', cmap='YlOrBr')
knn_ax.set_title('Confusion matrix of KNN classifier')
knn_ax.set_xlabel('Predicted')
knn_ax.set_ylabel('True')
plt.show()
knn_pct = round(knn_acc * 100, 2)
print(f"Model Accuracy of KNN :{knn_pct}%")
Model Accuracy of KNN :54.95%
from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees; the fixed seed makes the reported metrics reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
# Fit the model to the training data
rf_model.fit(X_train, Y_train)
# Predict once and reuse the result for both metrics.
rf_y_pred = rf_model.predict(X_test)
y_pred = rf_y_pred  # same predictions under the name the other cells use
# Accuracy and confusion matrix on the held-out split.
rf_acc = accuracy_score(Y_test, rf_y_pred)
rf_cm = confusion_matrix(Y_test, rf_y_pred)
print(rf_cm)
[[444 58] [218 99]]
# Heatmap of the random-forest confusion matrix, followed by the accuracy in percent.
sns.set(font_scale=1.4)
rf_ax = sns.heatmap(rf_cm, annot=True, fmt='g', cmap='coolwarm')
rf_ax.set_title('Confusion matrix of Random Forest classifier')
rf_ax.set_xlabel('Predicted')
rf_ax.set_ylabel('True')
plt.show()
rf_pct = round(rf_acc * 100, 2)
print(f"Model Accuracy of Random Forest is : {rf_pct}%")
Model Accuracy of Random Forest is : 66.3%
import xgboost as xgb
# Gradient-boosted trees with default hyper-parameters on the shared split.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, Y_train)
# Predict once; accuracy and confusion matrix are computed from the same array.
xg_y_pred = xgb_model.predict(X_test)
y_pred = xg_y_pred  # same predictions under the name the other cells use
xg_acc = accuracy_score(Y_test, xg_y_pred)
xg_cm = confusion_matrix(Y_test, xg_y_pred)
print(xg_cm)
[[400 102] [192 125]]
# Heatmap of the XGBoost confusion matrix, followed by the accuracy in percent.
sns.set(font_scale=1.4)
xg_ax = sns.heatmap(xg_cm, annot=True, fmt='g', cmap='rocket')
xg_ax.set_title('Confusion matrix of XGBoost classifier')
xg_ax.set_xlabel('Predicted')
xg_ax.set_ylabel('True')
plt.show()
xg_pct = round(xg_acc * 100, 2)
print(f"Model Accuracy of XGBoost : {xg_pct}%")
Model Accuracy of XGBoost : 64.1%